{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ddae88a9-6a18-476a-b90b-53112b62615f",
   "metadata": {},
   "source": [
    "# Connect to Elasticsearch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "56d01c7f-c888-49ae-855f-435f0ffb38ff",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'name': 'liuxgm.local', 'cluster_name': 'elasticsearch', 'cluster_uuid': 'h2QwONxsT4Kt-lTRKmPrhg', 'version': {'number': '8.12.0', 'build_flavor': 'default', 'build_type': 'tar', 'build_hash': '1665f706fd9354802c02146c1e6b5c0fbcddfbc9', 'build_date': '2024-01-11T10:05:27.953830042Z', 'build_snapshot': False, 'lucene_version': '9.9.1', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}\n"
     ]
    }
   ],
   "source": [
    "from elasticsearch import Elasticsearch\n",
    "from dotenv import load_dotenv\n",
    "import os\n",
    "from elasticsearch import Elasticsearch\n",
    "\n",
    "load_dotenv()\n",
    "\n",
    "# Every setting is read from the environment (.env), so no secret is stored in the notebook.\n",
    "openai_api_key = os.getenv(\"OPENAI_API_KEY\")\n",
    "elastic_user = os.getenv(\"ES_USER\")\n",
    "elastic_password = os.getenv(\"ES_PASSWORD\")\n",
    "elastic_endpoint = os.getenv(\"ES_ENDPOINT\")\n",
    "\n",
    "# Fail fast with a readable message; an unset variable would otherwise be formatted\n",
    "# into the connection URL as the literal text \"None\".\n",
    "missing_settings = [\n",
    "    name\n",
    "    for name, value in (\n",
    "        (\"ES_USER\", elastic_user),\n",
    "        (\"ES_PASSWORD\", elastic_password),\n",
    "        (\"ES_ENDPOINT\", elastic_endpoint),\n",
    "    )\n",
    "    if not value\n",
    "]\n",
    "if missing_settings:\n",
    "    raise RuntimeError(f\"Missing environment variables: {', '.join(missing_settings)}\")\n",
    "\n",
    "# Credentials are passed through basic_auth instead of being spliced into the URL:\n",
    "# a password containing '@', ':' or '/' would corrupt the URL, and the secret would\n",
    "# appear in anything that prints the URL.\n",
    "url = f\"https://{elastic_endpoint}:9200\"\n",
    "client = Elasticsearch(\n",
    "    url,\n",
    "    basic_auth=(elastic_user, elastic_password),\n",
    "    ca_certs=\"./http_ca.crt\",\n",
    "    verify_certs=True,\n",
    ")\n",
    "\n",
    "print(client.info())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a9942b7e-7bd9-4919-bc0d-40d044bb7e4f",
   "metadata": {},
   "source": [
    "# Sample dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "3d5d6e20-077b-44a2-a6a5-8c7f0212505a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Each entry pairs a one-line plot summary (\"text\") with structured fields (\"metadata\").\n",
    "# \"genre\" is only present for some of the movies.\n",
    "docs = [\n",
    "    {\n",
    "        \"text\": \"A bunch of scientists bring back dinosaurs and mayhem breaks loose\",\n",
    "        \"metadata\": {\"year\": 1993, \"rating\": 7.7, \"genre\": \"science fiction\", \"director\": \"Steven Spielberg\", \"title\": \"Jurassic Park\"},\n",
    "    },\n",
    "    {\n",
    "        \"text\": \"Leo DiCaprio gets lost in a dream within a dream within a dream within a ...\",\n",
    "        \"metadata\": {\"year\": 2010, \"director\": \"Christopher Nolan\", \"rating\": 8.2, \"title\": \"Inception\"},\n",
    "    },\n",
    "    {\n",
    "        \"text\": \"A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea\",\n",
    "        \"metadata\": {\"year\": 2006, \"director\": \"Satoshi Kon\", \"rating\": 8.6, \"title\": \"Paprika\"},\n",
    "    },\n",
    "    {\n",
    "        \"text\": \"A bunch of normal-sized women are supremely wholesome and some men pine after them\",\n",
    "        \"metadata\": {\"year\": 2019, \"director\": \"Greta Gerwig\", \"rating\": 8.3, \"title\": \"Little Women\"},\n",
    "    },\n",
    "    {\n",
    "        \"text\": \"Toys come alive and have a blast doing so\",\n",
    "        \"metadata\": {\"year\": 1995, \"genre\": \"animated\", \"director\": \"John Lasseter\", \"rating\": 8.3, \"title\": \"Toy Story\"},\n",
    "    },\n",
    "    {\n",
    "        \"text\": \"Three men walk into the Zone, three men walk out of the Zone\",\n",
    "        # \"rating\" used to be listed twice in this dict; one entry is enough.\n",
    "        \"metadata\": {\n",
    "            \"year\": 1979,\n",
    "            \"rating\": 9.9,\n",
    "            \"director\": \"Andrei Tarkovsky\",\n",
    "            \"genre\": \"science fiction\",\n",
    "            \"title\": \"Stalker\",\n",
    "        },\n",
    "    },\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cc15c8dd-e735-4c42-8c0a-7f3acf0b4e96",
   "metadata": {},
   "source": [
    "# Indexing data into Elasticsearch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "8269eb9d-57b2-4bed-9649-bb3ae89293c2",
   "metadata": {},
   "outputs": [],
   "source": [
    "from elasticsearch import helpers\n",
    "\n",
    "index_name = \"movies_self_query\"\n",
    "\n",
    "# Create the index only when it is missing, so re-running this cell does not fail\n",
    "# because the index already exists.\n",
    "if not client.indices.exists(index=index_name):\n",
    "    client.indices.create(index=index_name)\n",
    "\n",
    "# One bulk action per document. The list position is used as the document id, so a\n",
    "# re-run overwrites the same documents instead of adding duplicates.\n",
    "operations = [\n",
    "    {\n",
    "        \"_index\": index_name,\n",
    "        \"_id\": i,\n",
    "        \"text\": doc[\"text\"],\n",
    "        \"metadata\": doc[\"metadata\"],\n",
    "    }\n",
    "    for i, doc in enumerate(docs)\n",
    "]\n",
    "\n",
    "# Add the documents to the index directly\n",
    "response = helpers.bulk(client, operations)\n",
    "\n",
    "# Refresh so the documents are searchable by the time later cells query the index.\n",
    "# The trailing semicolon keeps the API response out of the cell output.\n",
    "client.indices.refresh(index=index_name);"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7875371e-b05b-4f52-86e1-82aedf7595d1",
   "metadata": {},
   "source": [
    "# Set up the self-query retriever"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "f13c487e-d510-4d01-a091-54c213846d3c",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/langchain_core/_api/deprecation.py:117: LangChainDeprecationWarning: The class `langchain_community.llms.openai.OpenAI` was deprecated in langchain-community 0.0.10 and will be removed in 0.2.0. An updated version of the class exists in the langchain-openai package and should be used instead. To use it run `pip install -U langchain-openai` and import as `from langchain_openai import OpenAI`.\n",
      "  warn_deprecated(\n"
     ]
    }
   ],
   "source": [
    "from langchain.vectorstores.elasticsearch import ApproxRetrievalStrategy\n",
    "from typing import List, Union\n",
    "from langchain.retrievers.self_query.base import SelfQueryRetriever\n",
    "from langchain.chains.query_constructor.base import AttributeInfo\n",
    "from langchain.llms import OpenAI\n",
    "from langchain.vectorstores.elasticsearch import ElasticsearchStore\n",
    "\n",
    "# Filterable metadata fields, one (name, description, type) row per field.\n",
    "# They are handed to SelfQueryRetriever.from_llm further down.\n",
    "_field_rows = [\n",
    "    (\n",
    "        \"genre\",\n",
    "        \"The genre of the movie. Can be either 'science fiction' or 'animated'.\",\n",
    "        \"string or list[string]\",\n",
    "    ),\n",
    "    (\"year\", \"The year the movie was released\", \"integer\"),\n",
    "    (\"director\", \"The name of the movie director\", \"string\"),\n",
    "    (\"rating\", \"A 1-10 rating for the movie\", \"float\"),\n",
    "]\n",
    "metadata_field_info = [\n",
    "    AttributeInfo(name=field_name, description=field_description, type=field_type)\n",
    "    for field_name, field_description, field_type in _field_rows\n",
    "]\n",
    "\n",
    "# Plain-language description of what the \"text\" of each document contains.\n",
    "document_content_description = \"Brief summary of a movie\"\n",
    "\n",
    "# OpenAI completion LLM with sampling temperature 0.\n",
    "llm = OpenAI(openai_api_key=openai_api_key, temperature=0)\n",
    "\n",
    "class BM25RetrievalStrategy(ApproxRetrievalStrategy):\n",
    "    \"\"\"Retrieval strategy that builds a keyword-only query body.\n",
    "\n",
    "    The search text is matched against the ``text`` field with ``multi_match``\n",
    "    and the supplied filters go into the ``filter`` context of a ``bool`` query;\n",
    "    no vector/kNN clause is produced.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self):\n",
    "        # Run the base initialiser instead of skipping it, so every attribute that\n",
    "        # ApproxRetrievalStrategy sets up exists on the instance.\n",
    "        super().__init__()\n",
    "\n",
    "    def query(\n",
    "        self,\n",
    "        query: Union[str, None],\n",
    "        filter: List[dict],\n",
    "        **kwargs,\n",
    "    ) -> dict:\n",
    "        \"\"\"Build the Elasticsearch request body for one search.\n",
    "\n",
    "        Args:\n",
    "            query: Free-text search string. When it is empty or None, no text\n",
    "                clause is added and only the filters apply.\n",
    "            filter: Filter clauses for the bool query; None is treated as no filters.\n",
    "            **kwargs: Any other arguments supplied by the caller; ignored here.\n",
    "\n",
    "        Returns:\n",
    "            A dict with a single \"query\" key holding the bool query.\n",
    "        \"\"\"\n",
    "        if query:\n",
    "            query_clause = [\n",
    "                {\n",
    "                    \"multi_match\": {\n",
    "                        \"query\": query,\n",
    "                        \"fields\": [\"text\"],\n",
    "                        \"fuzziness\": \"AUTO\",\n",
    "                    }\n",
    "                }\n",
    "            ]\n",
    "        else:\n",
    "            query_clause = []\n",
    "\n",
    "        bm25_query = {\n",
    "            \"query\": {\n",
    "                \"bool\": {\n",
    "                    \"filter\": filter or [],\n",
    "                    \"must\": query_clause,\n",
    "                }\n",
    "            },\n",
    "        }\n",
    "\n",
    "        # Kept on purpose: shows the generated query in the cell output.\n",
    "        print(\"query\", bm25_query)\n",
    "\n",
    "        return bm25_query\n",
    "\n",
    "\n",
    "# Expose the existing index through LangChain, using the strategy class defined above.\n",
    "bm25_strategy = BM25RetrievalStrategy()\n",
    "vectorstore = ElasticsearchStore(\n",
    "    strategy=bm25_strategy,\n",
    "    es_connection=client,\n",
    "    index_name=\"movies_self_query\",\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "814a8df1-6c2d-4046-bfb8-6bfb71cd0bcf",
   "metadata": {},
   "source": [
    "# BM25 Only Retriever"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "1a2485b6-f5fe-486e-ae15-45d618701cdc",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "query {'query': {'bool': {'filter': [{'bool': {'must': [{'term': {'metadata.genre.keyword': 'science fiction'}}, {'range': {'metadata.year': {'gt': 1992}}}, {'range': {'metadata.year': {'lt': 2007}}}]}}], 'must': [{'multi_match': {'query': 'dinosaur', 'fields': ['text'], 'fuzziness': 'AUTO'}}]}}}\n",
      "docs: [Document(page_content='A bunch of scientists bring back dinosaurs and mayhem breaks loose', metadata={'year': 1993, 'rating': 7.7, 'genre': 'science fiction', 'director': 'Steven Spielberg', 'title': 'Jurassic Park'})]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'Steven Spielberg directed Jurassic Park, which was released in 1993.'"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from langchain.schema.runnable import RunnableParallel, RunnablePassthrough\n",
    "from langchain.prompts import ChatPromptTemplate, PromptTemplate\n",
    "from langchain.schema import format_document\n",
    "\n",
    "# Self-query retriever built from the LLM, the Elasticsearch store and the field\n",
    "# descriptions defined in the previous cells.\n",
    "retriever = SelfQueryRetriever.from_llm(\n",
    "    llm=llm,\n",
    "    vectorstore=vectorstore,\n",
    "    document_contents=document_content_description,\n",
    "    metadata_field_info=metadata_field_info,\n",
    "    verbose=True,\n",
    ")\n",
    "\n",
    "# Answer prompt: {context} receives the formatted movies, {question} the user's question.\n",
    "_answer_template = \"\"\"\n",
    "Use the following context movies that matched the user question. Use the movies below only to answer the user's question.\n",
    "\n",
    "If you don't know the answer, just say that you don't know, don't try to make up an answer.\n",
    "\n",
    "----\n",
    "{context}\n",
    "----\n",
    "Question: {question}\n",
    "Answer:\n",
    "\"\"\"\n",
    "LLM_CONTEXT_PROMPT = ChatPromptTemplate.from_template(_answer_template)\n",
    "\n",
    "# Per-movie template filled from each document's metadata (title, year, director).\n",
    "# The lines carry no trailing padding, so no stray spaces end up in the LLM prompt.\n",
    "DOCUMENT_PROMPT = PromptTemplate.from_template(\"\"\"\n",
    "---\n",
    "title: {title}\n",
    "year: {year}\n",
    "director: {director}\n",
    "---\n",
    "\"\"\")\n",
    "\n",
    "def _combine_documents(docs, document_prompt=DOCUMENT_PROMPT, document_separator=\"\\n\\n\"):\n",
    "    \"\"\"Render every document with ``document_prompt`` and join the results.\n",
    "\n",
    "    The incoming documents are printed first, then each one is formatted and the\n",
    "    pieces are concatenated with ``document_separator`` between them.\n",
    "    \"\"\"\n",
    "    print(\"docs:\", docs)\n",
    "    rendered = (format_document(item, document_prompt) for item in docs)\n",
    "    return document_separator.join(rendered)\n",
    "\n",
    "\n",
    "# \"context\" runs the retriever and formats its documents; \"question\" passes the\n",
    "# input through unchanged. Both feed the answer prompt.\n",
    "context_branch = retriever | _combine_documents\n",
    "_context = RunnableParallel(context=context_branch, question=RunnablePassthrough())\n",
    "\n",
    "chain = _context | LLM_CONTEXT_PROMPT | llm\n",
    "\n",
    "user_question = \"Which director directed movies about dinosaurs that was released after the year 1992 but before 2007?\"\n",
    "chain.invoke(user_question)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "6f645776-5f23-424e-82bd-0fdfe03b1ef3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cleanup: uncomment and run the line below to delete the demo index when you are done.\n",
    "# client.indices.delete(index=\"movies_self_query\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0735f9b9-6191-4004-bab3-3ed09ab1abf9",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python_3.11",
   "language": "python",
   "name": "python_3.11"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
